Necessary Imports


In [32]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy
import scipy.cluster.hierarchy as hcluster
import numpy as np

Generate Data


In [26]:
# Build three synthetic 2-D clusters. Each cluster holds N points whose
# coordinates are drawn from the standard normal distribution (mean 0,
# variance 1) -- not from a uniform distribution over [0, 1) -- and is then
# shifted by adding a constant offset to both coordinates.

# Fixed seed so that re-running the notebook reproduces the same points.
SEED = 42
rng = numpy.random.default_rng(SEED)

# N points per cluster
N = 100

# cluster 1: centred on (0, 0)
c1 = rng.standard_normal((N, 2))

# cluster 2: centred on (5, 5)
c2 = rng.standard_normal((N, 2)) + 5

# cluster 3: centred on (10, 10)
c3 = rng.standard_normal((N, 2)) + 10

Visualize Data before Clustering


In [42]:
# Visualize the data before the clustering.
# Stack the three clusters row-wise into a single array of 2-D points and
# draw them all in one colour, i.e. without any cluster labels.
data = np.concatenate((c1, c2, c3))

fig, ax = plt.subplots()
ax.scatter(data[:, 0], data[:, 1], color='black')
# Equal scaling on both axes so distances between points are not distorted.
ax.axis("equal")
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_title("Data before clustering")
plt.show()


Perform the clustering and see if we can visualize the clusters


In [47]:
# Cluster the points in `data` hierarchically, then draw them again with
# one colour per resulting cluster.

# Important input: the threshold to apply when forming flat clusters.
thresh = 1.5

# Criterion used for forming the flat clusters. Valid values are
# 'inconsistent' (the default), 'distance' and 'maxclust'.
criterion = 'distance'

# Flat cluster labels for the points in `data`,
# e.g. [2 2 2 ... 3 3 ... 2 ... 1 ... 1 1 1].
clusters = hcluster.fclusterdata(data, thresh, criterion=criterion)

# Number of distinct labels, reported in the plot title.
n_clusters = len(set(clusters))
title = "threshold: %f, number of clusters: %d" % (thresh, n_clusters)

# Plotting: colour each point by its cluster label.
xs, ys = numpy.transpose(data)
plt.scatter(xs, ys, c=clusters)
plt.axis("equal")
plt.title(title)
plt.show()



In [ ]: